############### ###############
## 02 - CBO group preparation for analysis
## Project: CBO
## PI: Malte Lierl
## Written by: Kamil Kouhen
## Purpose: Making last adjustments and adding useful variables before analysis
## Date of creation: 07/01/2022
############### ###############

library(here)
#Running preceding code
#source(here("Code","Rcode", "01 - preparation - CBO group cleaning.R"), echo = T) #Cleaning of CBO group ds

#Cleaned dataframe
# Load the cleaned CBO-group data written by the preceding cleaning script.
# `datatype` is not defined in this file and must already exist in the session.
# NOTE(review): presumably set by a master/run script -- confirm.
CBO_groups_intermediate <- readRDS(here(datatype, "Intermediate", "CBO_groups_intermediate.RDS"))

### Restricting to sample of CBOs in municipalities that received scorecards and received municipal training ###
CBO_groups_final <- CBO_groups_intermediate %>%
  filter(Municipality_included_MON == 1) %>%
  # One important first step for final data: a "Don't Know" answer becomes NA.
  # This is done column by column because dplyr >= 1.1 no longer accepts
  # na_if(<data frame>, "Don't Know"). Only factor and character columns can
  # hold that value; replace() keeps factor levels and attributes intact.
  mutate(across(where(is.factor) | where(is.character),
                ~ replace(.x, .x %in% "Don't Know", NA)))

### For Hypothesis 1: A financial stake in municipal government performance increases CBOs’ involvement in municipal governance. ###

## A_Breadth_of_engagement: Total number of different types of engagement the CBO undertook
# Temporary vector holding the names of every raw engagement variable that
# feeds the breadth measure; it is only used to inspect missingness and is
# removed right after.
todrop_checkingna <- colnames(
  dplyr::select(
    CBO_groups_final,
    meet_cm, meet_cdc, meet_sg_num, meet_admin_num, meet_mayor_num,
    meet_councilor_num, meet_prefecture_any,
    meet_health_total, meet_education_total, meet_water_total, meet_village,
    research_data, research_opinion, research_interviews,
    organize_villagemeeting_num, organize_collab_num, organize_stakeholder_num,
    organize_cbo, organize_socialmedia,
    pressure_campaign_num, pressure_media
  )
)
#if (sum(is.na(CBO_groups_final[todrop_checkingna] %>% select_if(is.numeric))) != 0) print("There are missing values, be careful when creating new variable")
# Summary statistics of the selected columns.
# Author's finding: 1 or 2 real missing values for meet_sg_num, meet_mayor_num, and meet_councilor_num.
sumstats(CBO_groups_final[todrop_checkingna])
rm(todrop_checkingna)

#Note# Because there are only a handful of real missing values across all these variables, I chose to ignore them, i.e. they count as 0 in the total.
# Needs @Malte's approval

#Note# The distribution of the variables has also been checked for low variance etc... they all seem correct.
# Build one 0/1 indicator per type of engagement (written out by hand to avoid
# mistakes), then count how many types each CBO undertook.
# as.numeric(<condition>) gives 1 for TRUE, 0 for FALSE and NA for a missing
# input, and returns a plain numeric vector without attributes.
CBO_groups_final <- CBO_groups_final %>%
  mutate(
    # Meetings: indicator is 1 when the count is strictly positive
    bin_hyp1_cm = as.numeric(meet_cm > 0), #Went but weren't invited
    bin_hyp1_cdc = as.numeric(meet_cdc > 0), #Went but weren't invited
    bin_hyp1_sg = as.numeric(meet_sg_num > 0),
    bin_hyp1_admin = as.numeric(meet_admin_num > 0),
    bin_hyp1_mayor = as.numeric(meet_mayor_num > 0),
    bin_hyp1_councilor = as.numeric(meet_councilor_num > 0),
    bin_hyp1_prefecture = as.numeric(meet_prefecture_any == 1),
    bin_hyp1_health = as.numeric(meet_health_total > 0),
    bin_hyp1_education = as.numeric(meet_education_total > 0),
    bin_hyp1_water = as.numeric(meet_water_total > 0),
    bin_hyp1_village = as.numeric(meet_village > 0),
    # Research activities: indicator is 1 when the variable equals 1
    bin_hyp1_research_data = as.numeric(research_data == 1),
    bin_hyp1_research_opinion = as.numeric(research_opinion == 1),
    bin_hyp1_research_interviews = as.numeric(research_interviews == 1),
    # Organizing activities
    bin_hyp1_village_meeting = as.numeric(organize_villagemeeting_num > 0),
    bin_hyp1_collab_num = as.numeric(organize_collab_num > 0),
    bin_hyp1_stakeholder_num = as.numeric(organize_stakeholder_num > 0),
    bin_hyp1_organize_cbo = as.numeric(organize_cbo == 1),
    bin_hyp1_social_media = as.numeric(organize_socialmedia == 1),
    # Public pressure. Author's note: only using pressure_campaign_num because
    # of very low numbers for the other category.
    # NOTE(review): pressure_media is nevertheless included below -- confirm.
    bin_hyp1_pressure_campaign_num = as.numeric(pressure_campaign_num > 0),
    bin_hyp1_pressure_media = as.numeric(pressure_media == 1)
  ) %>%
  mutate(
    # Missing indicators are skipped by na.rm = TRUE, so they add 0 to the total
    A_Breadth_of_engagement = rowSums(
      across(starts_with("bin_hyp1")),
      na.rm = TRUE
    )
  ) %>%
  set_variable_labels(
    A_Breadth_of_engagement = "Total number of different types of engagement the CBO undertook (see list in 'variable classification.xlsx')."
  )

  #Note# The indicator variables will be labelled later

# Controls on the new variable (author's check: all good)
sumstats(CBO_groups_final[["A_Breadth_of_engagement"]])
label(CBO_groups_final[["A_Breadth_of_engagement"]])

# Ad-hoc helper reporting the new variable into
# here("Output", "For Cleaning", "Changes made to datasets.xlsx")
reporting.changes(
  CBO_groups_final,
  "A_Breadth_of_engagement",
  describe = "New variable: Total number of different types of engagement the CBO undertook (meet_cm, meet_cdc, meet_sg_num, meet_admin_num, meet_mayor_num, meet_councilor_num, meet_prefecture_any, meet_health_total, meet_education_total, meet_water_total, meet_village, research_data, research_opinion, research_interviews, organize_villagemeeting_num, organize_collab_num, organize_stakeholder_num, organize_cbo, organize_socialmedia, pressure_campaign_num, pressure_media)"
)

## A_Intensity_of_engagement: Z-score average index representing intensity of CBOs' involvement in municipal governance.

#Inspecting index components (individual original variables)
# Temporary vector with the names of the raw count variables behind the
# intensity index; used only to inspect missingness, then removed.
todrop_checkingna <- colnames(
  dplyr::select(
    CBO_groups_final,
    meet_cm, meet_cdc, meet_sg_num, meet_admin_num, meet_mayor_num,
    meet_councilor_num,
    meet_water_total, meet_health_total, meet_education_total,
    organize_stakeholder_num, organize_collab_num, organize_villagemeeting_num,
    pressure_campaign_num
  )
)
#if (sum(is.na(CBO_groups_final[todrop_checkingna] %>% select_if(is.numeric))) != 0) stop("There are missing values, be careful when creating new variable")
# Summary statistics of the selected columns.
# Author's finding: 1 or 2 real missing values for meet_sg_num, meet_mayor_num, and meet_councilor_num.
sumstats(CBO_groups_final[todrop_checkingna])
rm(todrop_checkingna)

#Aggregating some of the sub-components by type of engagement or decision maker met
# Aggregate some of the sub-components by type of decision maker met, then
# label the aggregates. Sums are taken row by row; na.rm = TRUE means a missing
# component adds nothing (a row with only missing components gets 0).
CBO_groups_final <- CBO_groups_final %>%
  rowwise() %>%
  mutate(
    sum_elected = sum(
      meet_mayor_num, meet_councilor_num, meet_cm, meet_cdc,
      na.rm = TRUE
    ),
    sum_unelected = sum(
      meet_sg_num, meet_admin_num,
      na.rm = TRUE
    ),
    sum_servicep = sum(
      meet_health_total, meet_education_total, meet_water_total,
      na.rm = TRUE
    )
  ) %>%
  ungroup() %>%
  #Note# The rest of the index sub-components will be integrated individually
  # Setting the aggregated components' labels
  set_variable_labels(
    sum_elected = "Sum of meetings with elected officials or bodies (mayor + councilor + cm + cdc)",
    sum_unelected = "Sum of meetings with unelected officials (sg + admin)"
  )

#Grouping index components
# Names of the seven index components, kept in a character vector for
# simplicity; `includedvars` is reused when the index is built.
includedvars <- colnames(
  dplyr::select(
    CBO_groups_final,
    sum_elected,
    sum_unelected,
    sum_servicep,
    organize_collab_num,
    organize_villagemeeting_num,
    organize_stakeholder_num,
    pressure_campaign_num
  )
)

# To see the total number of missing values.
# Author's finding: only 3 NAs left (replacing by mean).
sumstats(CBO_groups_final[includedvars])

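  #Note# These 3 missing values were originally "Don't Know" answers. We assume that if the CBO didn't know, the number of meetings was
       # 0 or close to 0, so I replace them by 0 when building the index.
       # NOTE(review): in the code below the replacement happens after standardization, so a missing component contributes a z-score of 0 (the sample mean), not a raw count of 0 -- confirm which is intended.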

#Z-score average
# Z-score average index:
#   1. standardize each component with scale() into a temporary
#      "<name>_std_todrop" column,
#   2. per row, add up the standardized components (missing ones are skipped)
#      and divide by the total number of components,
#   3. label the index and drop the temporary columns.
# NOTE(review): because the divisor is always length(includedvars), a missing
# standardized component counts as 0, i.e. the component's mean -- confirm.
CBO_groups_final <- CBO_groups_final %>%
  mutate(
    across(
      all_of(includedvars),
      scale,
      .names = "{col}_std_todrop"
    )
  ) %>%
  rowwise() %>%
  mutate(
    A_Intensity_of_engagement =
      sum(across(ends_with("_std_todrop")), na.rm = TRUE) / length(includedvars)
  ) %>%
  ungroup() %>%
  set_variable_labels(
    A_Intensity_of_engagement = "Z-score average index representing intensity of CBOs' involvement in municipal governance."
  ) %>%
  select(-contains("_std_todrop"))

# Ad-hoc helper reporting the new variable into
# here("Output", "For Cleaning", "Changes made to datasets.xlsx")
reporting.changes(
  CBO_groups_final,
  "A_Intensity_of_engagement",
  describe = "New variable: Z-score average index representing intensity of CBOs' involvement in municipal governance. (see list in 'variable classification.xlsx')"
)

## (for appendix) sum_servicep: sum of meetings with all types of service providers (water, educ, health)
# sum_servicep already exists (created for the intensity index); here it only
# receives its variable label and is reported as a new appendix variable.
CBO_groups_final <- set_variable_labels(
  CBO_groups_final,
  sum_servicep = "Sum of meetings with all types of service providers (water, educ, health)."
)

# Ad-hoc helper reporting the new variable into
# here("Output", "For Cleaning", "Changes made to datasets.xlsx")
reporting.changes(
  CBO_groups_final,
  "sum_servicep",
  describe = "New variable (in appendix): Sum of meetings with all types of service providers (water, educ, health)."
)

### For Hypothesis 2 - A_log_members_total_after: (log) Estimated number of CBO members after intervention (at time of data collec.), excluding deaths. ### 
# Membership before/after the intervention and their logs.
#   B_members_total_before = current members + left + died - joined
#   B_members_total_after  = current members + died
# The label and change-log strings are now single-line: the original literals
# were split across source lines, which embedded a newline plus indentation in
# the stored text.
CBO_groups_final %<>%
  # NOTE(review): rowwise() is not needed for this arithmetic; it is kept so the
  # resulting columns are built exactly as before.
  rowwise() %>%
  mutate(B_members_total_before = (members_total + members_left + members_died - members_joined)) %>%
  # If the reconstructed old number is negative (1 observation) or 0
  # (1 observation), it is set to NA (not to 1).
  mutate(B_members_total_before = ifelse(B_members_total_before <= 0, NA, B_members_total_before)) %>%
  mutate(B_members_total_after = (members_total + members_died)) %>%
  ungroup() %>%
  mutate(A_log_members_total_before = log(B_members_total_before)) %>%
  # log() returns -Inf if B_members_total_after is 0; see the (commented-out)
  # infinite-value check that follows this block.
  mutate(A_log_members_total_after = log(B_members_total_after)) %>%
  set_variable_labels(
    A_log_members_total_before = "(log) Estimated number of CBO members in 2020",
    A_log_members_total_after = "(log) Estimated number of CBO members after intervention (at time of data collec.), excluding deaths."
  )

  #The two observations with appcode == 1_7_6_4 & 8_26_6_1 are set to NA. Their numbers don't make sense:
  #the old total is negative or zero, while baseline data shows a total over 10.
  # NOTE(review): the NA is applied to B_members_total_before only, although the
  # change-log entry below is filed under A_log_members_total_after -- confirm.

# Ad-hoc helper reporting the new variable into
# here("Output", "For Cleaning", "Changes made to datasets.xlsx")
reporting.changes(CBO_groups_final,
                  "A_log_members_total_after",
                  describe = paste(
                    "(log) Estimated number of CBO members after intervention (at time of data collec.),",
                    "excluding deaths. The two observations with appcode == 1_7_6_4 & 8_26_6_1 are put as NA.",
                    "Their numbers don't make sense, the old total is negative or zero, while baseline data shows total over 10."
                  ))

#Checking if all is good
#if (sum(is.infinite(CBO_groups_final$A_log_members_total_after)) > 0) stop("There are some infinite values in A_log_member_change, please correct.")


### For Hypothesis 2: CBO has a budget ### 
#Using original variable: budget_any

### For Hypothesis 2 - meetings_general : Total number of general membership meetings ### 
#Using original variable: meetings_general

### For Hypothesis 2 - meetings_executive : Total number of executive board meetings ### 
#Using original variable: meetings_executive

### For Hypothesis 2 - covid_policy : CBO has covid response ###
#Using original variable: covid_policy

### For Hypothesis 2 - covid_adaptation : Covid-19 led CBO to adapt its activities ###
# Recode covid_adaptation onto a 0-3 scale: its numeric value 1, 2, 3, 4
# becomes 0, 1, 2, 3; any other value (including NA) becomes NA.
# match() returns the position of the value within 1:4 (or NA if absent), so
# subtracting 1 gives the recoded value.
CBO_groups_final <- CBO_groups_final %>%
  mutate(
    A_covid_adaptation = match(as.numeric(covid_adaptation), 1:4) - 1
  )

### For Hypothesis 2 - A_game_group_contrib : Share of amount dedicated to sharing with other members in decision game  ###
  
  #Note# A game was run with each individual in which a fixed amount was distributed; individuals could decide to keep the whole amount or share some with the group. 
       # Variables are in wide format: one variable for each interview (up to the maximum number of interviewed individuals)
# For each interview slot i, compute the share of the game amount given to the
# group: amount_group_i / (amount_group_i + amount_self_i).
# Column names are built with paste0() and looked up through the .data pronoun
# instead of generating and evaluating code strings with eval(parse()).
# The division is vectorized, so no rowwise() pass is needed.
for (i in seq_len(8)) { #Maximum number of interviewees is 8
  CBO_groups_final <- CBO_groups_final %>%
    mutate(
      # as.vector() drops any attributes (e.g. a variable label) that the
      # arithmetic would otherwise carry over from the amount columns.
      !!paste0("A_game_groupcontrib_", i) := as.vector(
        .data[[paste0("amount_group_", i)]] /
          (.data[[paste0("amount_group_", i)]] + .data[[paste0("amount_self_", i)]])
      )
    )
}
#Variables are wide, I need to build another dataset by reshaping ds: observations are individuals
# Individual-level dataset: keep the identifiers, the covariates and the
# per-interview contribution shares, then stack the wide
# A_game_groupcontrib_* columns so that each row is one interview slot of one
# CBO. The source column name goes to `reshaped_id`, the share to
# `A_game_group_contrib`.
CBO_groups_final_IND <- pivot_longer(
  select(
    CBO_groups_final,
    region,
    commune,
    appcode,
    randomization_block,
    situation,
    contains("A_game_groupcontrib"),
    contributions_late,
    contributions_nonpayment,
    contributions_total,
    budget_total,
    members_total,
    ends_with("_BL")
  ),
  cols = contains("A_game_groupcontrib"),
  names_to = "reshaped_id",
  values_to = "A_game_group_contrib"
)

#Creating weights to compensate for the uneven number of individual per CBO
# Weights compensating for the uneven number of interviewed individuals per
# CBO: each CBO's observed contributions share a total weight of 1.
# The reshaped data has one row per interview slot for every CBO, including
# slots where A_game_group_contrib is missing. Counting rows with n() would
# therefore give the same count (and the same weight) to every CBO; counting
# non-missing contributions gives the actual number of individuals.
CBO_groups_final_IND %<>%
  group_by(appcode) %>% #grouping by CBO
  mutate(todrop_countCBO = sum(!is.na(A_game_group_contrib))) %>%
  # A CBO without any observed contribution gets NA rather than 1/0 = Inf
  mutate(IND_weight = ifelse(todrop_countCBO > 0, 1 / todrop_countCBO, NA_real_)) %>%
  ungroup() %>%
  select(-contains("todrop_"))

### Saving intermediate (pre-preparation for analysis) cleaned blinded CBO group dataset ###
# Write the CBO-level and the individual-level datasets to the "Final" folder
# under `datatype`, then signal that this script has finished.
saveRDS(
  object = CBO_groups_final,
  file = here(datatype, "Final", "CBO_groups_final.RDS")
)
saveRDS(
  object = CBO_groups_final_IND,
  file = here(datatype, "Final", "CBO_groups_final_IND.RDS")
)

message("**02 completed")

##########################

